# -*- coding: utf-8 -*-
# @Time: 2025/6/29 21:56
# @Author: wzd
# @Email: 2146333089@qq.com
# @File: process_chain.py
def process_ebook_pipeline():
    """Run the e-book ingestion pipeline from raw documents to a vector store.

    Steps, in order:

    1. Scan ``./原始书籍`` (top level only) for ``.pdf``, ``.docx`` and
       ``.pptx`` files; the suffix match is case-insensitive.
    2. Convert each file with ``convert_to_markdown`` (given the
       ``./标准化书籍`` directory) and read the result back as UTF-8 text.
    3. Pass that text through ``clean_text`` and ``structure_document``.
    4. Hand the structured result and the source path to
       ``save_to_unified_format``.
    5. Open a Chroma store persisted at ``./vector_db`` and add the chunks
       that ``create_chunks_from_unified_data`` yields for every
       ``*.parquet`` file found in ``./统一存储``.

    Takes no arguments, returns ``None`` and prints a completion message
    when done. Errors raised by the helpers are not caught here.
    """
    # Function-scope import keeps the block self-contained.
    from pathlib import Path

    # 1. Locations used by the pipeline.
    raw_dir = Path("./原始书籍")
    std_dir = Path("./标准化书籍")
    # NOTE(review): presumably where save_to_unified_format writes its
    # parquet output -- the helper is not given this path, so confirm.
    unified_dir = Path("./统一存储")

    supported_suffixes = (".pdf", ".docx", ".pptx")

    # 2. Format conversion.
    for file in raw_dir.glob("*.*"):
        if file.suffix.lower() not in supported_suffixes:
            continue
        # A directory named e.g. "notes.pdf" also matches the glob; skip it.
        if not file.is_file():
            continue

        md_file = convert_to_markdown(file, std_dir)

        # 3. Content enhancement and cleaning.
        with open(md_file, "r", encoding="utf-8") as f:
            raw_text = f.read()

        cleaned_text = clean_text(raw_text)
        structured = structure_document(cleaned_text)

        # 4. Unified storage.
        save_to_unified_format(structured, file)

    # 5. Build the vector store.
    vector_db = Chroma(persist_directory="./vector_db")
    for parquet in unified_dir.glob("*.parquet"):
        chunks = create_chunks_from_unified_data(parquet)
        # Nothing to index for this file; do not hand the store an empty
        # batch. NOTE(review): some Chroma versions reject empty batches
        # outright -- confirm against the installed version.
        if not chunks:
            continue
        vector_db.add_documents(chunks)

    print("✅ 知识向量库构建完成！")